{
 "cells": [
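  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Multi-Label Classification Benchmark\n",
    "\n",
    "Trains and evaluates Kashgari's `BiLSTM_Model` in multi-label mode on the Jigsaw toxic-comment classification corpus.\n",
    "\n",
    "- Kashgari: 2.0.0\n",
    "- TensorFlow: 2.0.0\n"
   ]
  },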
  {
   "cell_type": "markdown",
   "source": [
    "## Configuration"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "CORPUS_PATH = '/Users/brikerman/Downloads/jigsaw-toxic-comment-classification-challenge/train.csv'"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "from kashgari.corpus import JigsawToxicCommentCorpus\n",
    "\n",
    "# Corpus reader over the CSV configured in CORPUS_PATH.\n",
    "corpus = JigsawToxicCommentCorpus(CORPUS_PATH)\n",
    "\n",
    "# Fetch the three (x, y) splits. The first call relies on load_data's default\n",
    "# subset; the call order is kept as-is in case load_data is order-sensitive.\n",
    "train_x, train_y = corpus.load_data()\n",
    "test_x, test_y = corpus.load_data('test')\n",
    "valid_x, valid_y = corpus.load_data('valid')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Preparing text vocab dict: 100%|██████████| 3510/3510 [00:00<00:00, 43976.22it/s]\n",
      "INFO:root:------ Build vocab dict finished, Top 10 token ------\n",
      "INFO:root:Token: [PAD]    -> 0\n",
      "INFO:root:Token: [UNK]    -> 1\n",
      "INFO:root:Token: [BOS]    -> 2\n",
      "INFO:root:Token: [EOS]    -> 3\n",
      "INFO:root:Token: .        -> 4\n",
      "INFO:root:Token: the      -> 5\n",
      "INFO:root:Token: ,        -> 6\n",
      "INFO:root:Token: \"        -> 7\n",
      "INFO:root:Token: to       -> 8\n",
      "INFO:root:Token: i        -> 9\n",
      "INFO:root:------ Build vocab dict finished, Top 10 token ------\n",
      "Preparing classification label vocab dict: 100%|██████████| 3510/3510 [00:00<00:00, 938723.91it/s]\n",
      "Calculating sequence length: 100%|██████████| 3510/3510 [00:00<00:00, 871846.92it/s]\n",
      "WARNING:root:Calculated sequence length = 282\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model_7\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input (InputLayer)           [(None, None)]            0         \n",
      "_________________________________________________________________\n",
      "layer_embedding (Embedding)  (None, None, 100)         666700    \n",
      "_________________________________________________________________\n",
      "bidirectional_3 (Bidirection (None, 256)               234496    \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 6)                 1542      \n",
      "_________________________________________________________________\n",
      "activation_3 (Activation)    (None, 6)                 0         \n",
      "=================================================================\n",
      "Total params: 902,738\n",
      "Trainable params: 902,738\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n",
      "Train for 54 steps, validate for 11 steps\n",
      "Epoch 1/2\n",
      "53/54 [============================>.] - ETA: 0s - loss: 0.2300 - accuracy: 0.9547"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Sequence length is None, will use the max length of the samples, which is 1038\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      precision    recall  f1-score   support\n",
      "               toxic     0.0000    0.0000    0.0000        81\n",
      "             obscene     0.0000    0.0000    0.0000        44\n",
      "              insult     0.0000    0.0000    0.0000        46\n",
      "       identity_hate     0.0000    0.0000    0.0000        10\n",
      "        severe_toxic     0.0000    0.0000    0.0000         8\n",
      "              threat     0.0000    0.0000    0.0000         4\n",
      "           macro avg     0.0000    0.0000    0.0000       193\n",
      "\n",
      "\n",
      "epoch: 0 precision: 0.000000, recall: 0.000000, f1-score: 0.000000\n",
      "54/54 [==============================] - 46s 855ms/step - loss: 0.2279 - accuracy: 0.9551 - val_loss: 0.1627 - val_accuracy: 0.9567\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/brikerman/Desktop/python/Kashgari2/venv/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 2/2\n",
      "53/54 [============================>.] - ETA: 0s - loss: 0.1440 - accuracy: 0.9619                      precision    recall  f1-score   support\n",
      "               toxic     0.0000    0.0000    0.0000        81\n",
      "             obscene     0.0000    0.0000    0.0000        44\n",
      "              insult     0.0000    0.0000    0.0000        46\n",
      "       identity_hate     0.0000    0.0000    0.0000        10\n",
      "        severe_toxic     0.0000    0.0000    0.0000         8\n",
      "              threat     0.0000    0.0000    0.0000         4\n",
      "           macro avg     0.0000    0.0000    0.0000       193\n",
      "\n",
      "\n",
      "epoch: 1 precision: 0.000000, recall: 0.000000, f1-score: 0.000000\n",
      "54/54 [==============================] - 40s 736ms/step - loss: 0.1422 - accuracy: 0.9624 - val_loss: 0.1673 - val_accuracy: 0.9573\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "from kashgari.tasks.classification import BiLSTM_Model\n",
    "from kashgari.callbacks import EvalCallBack\n",
    "\n",
    "logging.basicConfig(level='DEBUG')\n",
    "\n",
    "model = BiLSTM_Model(multi_label=True)\n",
    "\n",
    "eval_callback = EvalCallBack(model,\n",
    "                            test_x,\n",
    "                            test_y,\n",
    "                            step=1)\n",
    "\n",
    "x = model.fit(train_x, train_y, valid_x, valid_y,\n",
    "          epochs=2,\n",
    "          callbacks=[eval_callback])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      precision    recall  f1-score   support\n",
      "               toxic     0.0000    0.0000    0.0000       351\n",
      "             obscene     0.0000    0.0000    0.0000       182\n",
      "              insult     0.0000    0.0000    0.0000       181\n",
      "       identity_hate     0.0000    0.0000    0.0000        34\n",
      "        severe_toxic     0.0000    0.0000    0.0000        33\n",
      "              threat     0.0000    0.0000    0.0000        13\n",
      "           macro avg     0.0000    0.0000    0.0000       794\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/brikerman/Desktop/python/Kashgari2/venv/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'precision': 0.0,\n",
       " 'recall': 0.0,\n",
       " 'f1-score': 0.0,\n",
       " 'support': 794,\n",
       " 'detail': {'toxic': {'precision': 0.0,\n",
       "   'recall': 0.0,\n",
       "   'f1': 0.0,\n",
       "   'support': 351},\n",
       "  'obscene': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'support': 182},\n",
       "  'insult': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'support': 181},\n",
       "  'identity_hate': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'support': 34},\n",
       "  'severe_toxic': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'support': 33},\n",
       "  'threat': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'support': 13}}}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.evaluate(train_x, train_y)\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python37464bitvenvvenv1fddca7c786445709a8f03a12aa91dfc",
   "language": "python",
   "display_name": "Python 3.7.4 64-bit ('venv': venv)"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}